In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [3]:
hosp_det = pd.read_csv(r"E:\DATA SCIENCE\CapStone\Project1\Hospitalisation details.csv")
med_exams = pd.read_csv(r"E:\DATA SCIENCE\CapStone\Project1\Medical Examinations.csv")
name_data = pd.read_excel(r"E:\DATA SCIENCE\CapStone\Project1\Names.xlsx")
In [4]:
hosp_det
Out[4]:
Customer ID year month date children charges Hospital tier City tier State ID
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013
... ... ... ... ... ... ... ... ... ...
2338 Id5 1989 Jun 19 0 55135.40 tier - 1 tier - 2 R1012
2339 Id4 1991 Jun 6 1 58571.07 tier - 1 tier - 3 R1024
2340 Id3 1970 ? 11 3 60021.40 tier - 1 tier - 1 R1012
2341 Id2 1977 Jun 8 0 62592.87 tier - 2 tier - 3 R1013
2342 Id1 1968 Oct 12 0 63770.43 tier - 1 tier - 3 R1013

2343 rows × 9 columns

In [5]:
med_exams
Out[5]:
Customer ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker
0 Id1 47.410 7.47 No No No No major surgery yes
1 Id2 30.360 5.77 No No No No major surgery yes
2 Id3 34.485 11.87 yes No No 2 yes
3 Id4 38.095 6.05 No No No No major surgery yes
4 Id5 35.530 5.45 No No No No major surgery yes
... ... ... ... ... ... ... ... ...
2330 Id2331 22.340 5.57 No No No 1 No
2331 Id2332 17.700 6.28 No No No 1 No
2332 Id2333 16.470 6.35 No No Yes 1 No
2333 Id2334 17.600 4.39 No No No 1 No
2334 Id2335 17.580 4.51 No No No 1 No

2335 rows × 8 columns

In [6]:
name_data
Out[6]:
Customer ID name
0 Id1 Hawks, Ms. Kelly
1 Id2 Lehner, Mr. Matthew D
2 Id3 Lu, Mr. Phil
3 Id4 Osborne, Ms. Kelsey
4 Id5 Kadala, Ms. Kristyn
... ... ...
2330 Id2331 Brietzke, Mr. Jordan
2331 Id2332 Riveros Gonzalez, Mr. Juan D. Sr.
2332 Id2333 Albano, Ms. Julie
2333 Id2334 Rosendahl, Mr. Evan P
2334 Id2335 German, Mr. Aaron K

2335 rows × 2 columns

In [7]:
hosp_det.describe()
Out[7]:
date children charges
count 2343.000000 2343.000000 2343.000000
mean 15.554844 1.026035 13559.067870
std 8.721194 1.233847 11922.658415
min 1.000000 0.000000 563.840000
25% 8.000000 0.000000 5084.010000
50% 15.000000 0.000000 9634.540000
75% 23.000000 2.000000 17029.675000
max 30.000000 5.000000 63770.430000
In [8]:
med_exams.describe()
Out[8]:
BMI HBA1C
count 2335.000000 2335.000000
mean 30.972649 6.578998
std 8.742095 2.228731
min 15.010000 4.000000
25% 24.600000 4.900000
50% 30.400000 5.810000
75% 36.300000 7.955000
max 55.050000 12.000000
In [9]:
name_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer ID  2335 non-null   object
 1   name         2335 non-null   object
dtypes: object(2)
memory usage: 36.6+ KB
In [10]:
hosp_det.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2343 entries, 0 to 2342
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer ID    2343 non-null   object 
 1   year           2343 non-null   object 
 2   month          2343 non-null   object 
 3   date           2343 non-null   int64  
 4   children       2343 non-null   int64  
 5   charges        2343 non-null   float64
 6   Hospital tier  2343 non-null   object 
 7   City tier      2343 non-null   object 
 8   State ID       2343 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 164.9+ KB
In [11]:
med_exams.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             2335 non-null   object 
 1   BMI                     2335 non-null   float64
 2   HBA1C                   2335 non-null   float64
 3   Heart Issues            2335 non-null   object 
 4   Any Transplants         2335 non-null   object 
 5   Cancer history          2335 non-null   object 
 6   NumberOfMajorSurgeries  2335 non-null   object 
 7   smoker                  2335 non-null   object 
dtypes: float64(2), object(6)
memory usage: 146.1+ KB

Collate the files so that all the information is in one place¶

In [13]:
dataset = pd.merge(hosp_det, med_exams, on="Customer ID")

dataset = pd.merge(dataset, name_data, on="Customer ID")

dataset.head()
Out[13]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013 17.58 4.51 No No No 1 No German, Mr. Aaron K
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013 17.60 4.39 No No No 1 No Rosendahl, Mr. Evan P
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013 16.47 6.35 No No Yes 1 No Albano, Ms. Julie
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013 17.70 6.28 No No No 1 No Riveros Gonzalez, Mr. Juan D. Sr.
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013 22.34 5.57 No No No 1 No Brietzke, Mr. Jordan
In [14]:
dataset.isnull().sum().sum()
Out[14]:
0
In [15]:
dataset.shape
Out[15]:
(2335, 17)

Percentage of rows that have trivial value, and delete such rows if they do not contain significant information¶

In [17]:
trivial_rows = dataset.isin(['?']).any(axis=1)

num_trivial_rows = trivial_rows.sum()

total_rows = dataset.shape[0]

percentage_trivial = (num_trivial_rows / total_rows)*100 
print(f"Percentage of rows with trivial values: {percentage_trivial*100}%")
Percentage of rows with trivial values: 42.82655246252677%
In [18]:
dataset = dataset[~trivial_rows]

print(dataset.shape)
(2325, 17)

Transform the nominal and ordinal categorical variables¶

In [20]:
from sklearn.preprocessing import LabelEncoder
In [21]:
le = LabelEncoder()
In [22]:
nominal_vars = ['Heart Issues', 'Any Transplants', 'Cancer history', 'smoker']
for var in nominal_vars:
    dataset[var] = le.fit_transform(dataset[var])
In [23]:
hospital_tier_mapping = {'tier - 1': 1, 'tier - 2': 2, 'tier - 3': 3}
city_tier_mapping = {'tier - 1': 1, 'tier - 2': 2, 'tier - 3': 3}

dataset['Hospital tier'] = dataset['Hospital tier'].map(hospital_tier_mapping)
dataset['City tier'] = dataset['City tier'].map(city_tier_mapping)
In [24]:
dataset
Out[24]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name
0 Id2335 1992 Jul 9 0 563.84 2 3 R1013 17.580 4.51 0 0 0 1 0 German, Mr. Aaron K
1 Id2334 1992 Nov 30 0 570.62 2 1 R1013 17.600 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P
2 Id2333 1993 Jun 30 0 600.00 2 1 R1013 16.470 6.35 0 0 1 1 0 Albano, Ms. Julie
3 Id2332 1992 Sep 13 0 604.54 3 3 R1013 17.700 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr.
4 Id2331 1998 Jul 27 0 637.26 3 3 R1013 22.340 5.57 0 0 0 1 0 Brietzke, Mr. Jordan
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2329 Id6 1962 Aug 4 0 52590.83 1 3 R1011 32.800 6.59 0 0 0 No major surgery 1 Baker, Mr. Russell B.
2330 Id5 1989 Jun 19 0 55135.40 1 2 R1012 35.530 5.45 0 0 0 No major surgery 1 Kadala, Ms. Kristyn
2331 Id4 1991 Jun 6 1 58571.07 1 3 R1024 38.095 6.05 0 0 0 No major surgery 1 Osborne, Ms. Kelsey
2333 Id2 1977 Jun 8 0 62592.87 2 3 R1013 30.360 5.77 0 0 0 No major surgery 1 Lehner, Mr. Matthew D
2334 Id1 1968 Oct 12 0 63770.43 1 3 R1013 47.410 7.47 0 0 0 No major surgery 1 Hawks, Ms. Kelly

2325 rows × 17 columns

Given the imbalanced representation of states (16) and potential insignificance of many dummy variables, focus on R1011, R1012, and R1013 for further analysis. Develop a strategy to create dummy variables for these specific regions.¶

In [26]:
dataset['State ID'].value_counts()
Out[26]:
State ID
R1013    609
R1011    574
R1012    572
R1024    159
R1026     84
R1021     70
R1016     64
R1025     40
R1023     38
R1017     36
R1019     26
R1022     14
R1014     13
R1015     11
R1018      9
R1020      6
Name: count, dtype: int64
In [27]:
dataset['group_state'] = np.where(dataset['State ID'].isin(['R1011', 'R1012', 'R1013']), dataset['State ID'], 'other')
In [28]:
dummie_state = pd.get_dummies(dataset['group_state'], prefix='state')
df = pd.concat([dataset,dummie_state],axis=1)
dataset
Out[28]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name group_state
0 Id2335 1992 Jul 9 0 563.84 2 3 R1013 17.580 4.51 0 0 0 1 0 German, Mr. Aaron K R1013
1 Id2334 1992 Nov 30 0 570.62 2 1 R1013 17.600 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P R1013
2 Id2333 1993 Jun 30 0 600.00 2 1 R1013 16.470 6.35 0 0 1 1 0 Albano, Ms. Julie R1013
3 Id2332 1992 Sep 13 0 604.54 3 3 R1013 17.700 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr. R1013
4 Id2331 1998 Jul 27 0 637.26 3 3 R1013 22.340 5.57 0 0 0 1 0 Brietzke, Mr. Jordan R1013
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2329 Id6 1962 Aug 4 0 52590.83 1 3 R1011 32.800 6.59 0 0 0 No major surgery 1 Baker, Mr. Russell B. R1011
2330 Id5 1989 Jun 19 0 55135.40 1 2 R1012 35.530 5.45 0 0 0 No major surgery 1 Kadala, Ms. Kristyn R1012
2331 Id4 1991 Jun 6 1 58571.07 1 3 R1024 38.095 6.05 0 0 0 No major surgery 1 Osborne, Ms. Kelsey other
2333 Id2 1977 Jun 8 0 62592.87 2 3 R1013 30.360 5.77 0 0 0 No major surgery 1 Lehner, Mr. Matthew D R1013
2334 Id1 1968 Oct 12 0 63770.43 1 3 R1013 47.410 7.47 0 0 0 No major surgery 1 Hawks, Ms. Kelly R1013

2325 rows × 18 columns

In [29]:
dataset = dataset[dataset["State ID"].isin(['R1011', 'R1012', 'R1013'])] 
In [30]:
dataset.shape # 609+574+572 = 1755
Out[30]:
(1755, 18)
In [31]:
le = LabelEncoder()
In [32]:
dataset.loc[:, 'State ID'] = le.fit_transform(dataset['State ID']).astype(int) #explicitly casting the output of fit_transform()
In [33]:
dataset['State ID'].unique()
Out[33]:
array([2, 1, 0], dtype=object)
In [34]:
replace_map = {'R1011': 1, 'R1012': 2, 'R1013': 3, 'other': 0}

dataset.loc[:, 'group_state'] = dataset['group_state'].replace(replace_map).astype(int)
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\1127126202.py:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  dataset.loc[:, 'group_state'] = dataset['group_state'].replace(replace_map).astype(int)
In [35]:
dataset
Out[35]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name group_state
0 Id2335 1992 Jul 9 0 563.84 2 3 2 17.58 4.51 0 0 0 1 0 German, Mr. Aaron K 3
1 Id2334 1992 Nov 30 0 570.62 2 1 2 17.60 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P 3
2 Id2333 1993 Jun 30 0 600.00 2 1 2 16.47 6.35 0 0 1 1 0 Albano, Ms. Julie 3
3 Id2332 1992 Sep 13 0 604.54 3 3 2 17.70 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr. 3
4 Id2331 1998 Jul 27 0 637.26 3 3 2 22.34 5.57 0 0 0 1 0 Brietzke, Mr. Jordan 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2328 Id7 1994 Oct 27 1 51194.56 1 3 0 36.40 6.07 0 0 0 No major surgery 1 Macpherson, Mr. Scott 1
2329 Id6 1962 Aug 4 0 52590.83 1 3 0 32.80 6.59 0 0 0 No major surgery 1 Baker, Mr. Russell B. 1
2330 Id5 1989 Jun 19 0 55135.40 1 2 1 35.53 5.45 0 0 0 No major surgery 1 Kadala, Ms. Kristyn 2
2333 Id2 1977 Jun 8 0 62592.87 2 3 2 30.36 5.77 0 0 0 No major surgery 1 Lehner, Mr. Matthew D 3
2334 Id1 1968 Oct 12 0 63770.43 1 3 2 47.41 7.47 0 0 0 No major surgery 1 Hawks, Ms. Kelly 3

1755 rows × 18 columns

The 'NumberOfMajorSurgeries' variable contains string values. Apply an appropriate method to clean and convert this variable to a numerical format.¶

In [37]:
replace_map = {'No major surgery': 0}

dataset.loc[:, "NumberOfMajorSurgeries"] = dataset["NumberOfMajorSurgeries"].replace(replace_map)
In [38]:
dataset
Out[38]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name group_state
0 Id2335 1992 Jul 9 0 563.84 2 3 2 17.58 4.51 0 0 0 1 0 German, Mr. Aaron K 3
1 Id2334 1992 Nov 30 0 570.62 2 1 2 17.60 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P 3
2 Id2333 1993 Jun 30 0 600.00 2 1 2 16.47 6.35 0 0 1 1 0 Albano, Ms. Julie 3
3 Id2332 1992 Sep 13 0 604.54 3 3 2 17.70 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr. 3
4 Id2331 1998 Jul 27 0 637.26 3 3 2 22.34 5.57 0 0 0 1 0 Brietzke, Mr. Jordan 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2328 Id7 1994 Oct 27 1 51194.56 1 3 0 36.40 6.07 0 0 0 0 1 Macpherson, Mr. Scott 1
2329 Id6 1962 Aug 4 0 52590.83 1 3 0 32.80 6.59 0 0 0 0 1 Baker, Mr. Russell B. 1
2330 Id5 1989 Jun 19 0 55135.40 1 2 1 35.53 5.45 0 0 0 0 1 Kadala, Ms. Kristyn 2
2333 Id2 1977 Jun 8 0 62592.87 2 3 2 30.36 5.77 0 0 0 0 1 Lehner, Mr. Matthew D 3
2334 Id1 1968 Oct 12 0 63770.43 1 3 2 47.41 7.47 0 0 0 0 1 Hawks, Ms. Kelly 3

1755 rows × 18 columns

Age is likely a significant factor in this analysis. Calculate the age of each patient based on their date of birth.¶

In [40]:
month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
dataset.loc[:, 'month'] = dataset['month'].map(month_dict)
In [41]:
dataset.loc[:, 'year'] = dataset['year'].astype(int)
In [42]:
current_year = 2023
dataset.loc[:, 'age'] = current_year - dataset['year']
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\3994024914.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.loc[:, 'age'] = current_year - dataset['year']
In [43]:
dataset
Out[43]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name group_state age
0 Id2335 1992 7 9 0 563.84 2 3 2 17.58 4.51 0 0 0 1 0 German, Mr. Aaron K 3 31
1 Id2334 1992 11 30 0 570.62 2 1 2 17.60 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P 3 31
2 Id2333 1993 6 30 0 600.00 2 1 2 16.47 6.35 0 0 1 1 0 Albano, Ms. Julie 3 30
3 Id2332 1992 9 13 0 604.54 3 3 2 17.70 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr. 3 31
4 Id2331 1998 7 27 0 637.26 3 3 2 22.34 5.57 0 0 0 1 0 Brietzke, Mr. Jordan 3 25
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2328 Id7 1994 10 27 1 51194.56 1 3 0 36.40 6.07 0 0 0 0 1 Macpherson, Mr. Scott 1 29
2329 Id6 1962 8 4 0 52590.83 1 3 0 32.80 6.59 0 0 0 0 1 Baker, Mr. Russell B. 1 61
2330 Id5 1989 6 19 0 55135.40 1 2 1 35.53 5.45 0 0 0 0 1 Kadala, Ms. Kristyn 2 34
2333 Id2 1977 6 8 0 62592.87 2 3 2 30.36 5.77 0 0 0 0 1 Lehner, Mr. Matthew D 3 46
2334 Id1 1968 10 12 0 63770.43 1 3 2 47.41 7.47 0 0 0 0 1 Hawks, Ms. Kelly 3 55

1755 rows × 19 columns

To potentially identify factors influencing hospitalization costs, we'll create a new field for the beneficiary's gender. This can be inferred from the salutations present in their name.¶

In [45]:
dataset.loc[:, 'Gender'] = dataset['name'].apply(lambda x: '0' if 'Mr.' in x else '1')

dataset.head()
C:\Users\Asus\AppData\Local\Temp\ipykernel_12196\2122272058.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.loc[:, 'Gender'] = dataset['name'].apply(lambda x: '0' if 'Mr.' in x else '1')
Out[45]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name group_state age Gender
0 Id2335 1992 7 9 0 563.84 2 3 2 17.58 4.51 0 0 0 1 0 German, Mr. Aaron K 3 31 0
1 Id2334 1992 11 30 0 570.62 2 1 2 17.60 4.39 0 0 0 1 0 Rosendahl, Mr. Evan P 3 31 0
2 Id2333 1993 6 30 0 600.00 2 1 2 16.47 6.35 0 0 1 1 0 Albano, Ms. Julie 3 30 1
3 Id2332 1992 9 13 0 604.54 3 3 2 17.70 6.28 0 0 0 1 0 Riveros Gonzalez, Mr. Juan D. Sr. 3 31 0
4 Id2331 1998 7 27 0 637.26 3 3 2 22.34 5.57 0 0 0 1 0 Brietzke, Mr. Jordan 3 25 0

To visualize the distribution of costs, create a histogram, box plot, and swarm plot.¶

In [47]:
plt.figure(figsize=(13,6))
ax = sns.histplot(dataset["charges"], kde=False, bins=20)

for p in ax.patches:
    height = p.get_height()
    x = p.get_x() + p.get_width() / 2
    ax.text(x, height + 2, f'{int(height)}', ha='center', va='bottom', fontsize=13)

plt.title('Distribution of costs')
plt.show()
No description has been provided for this image
In [48]:
plt.figure(figsize=(9,4))
sns.boxplot(x=dataset["charges"])

plt.title('Distribution of costs')

plt.show()
No description has been provided for this image

Analyze how the distribution of costs varies across different genders and hospital tiers¶

In [50]:
plt.figure(figsize=(15,8))
sns.swarmplot(dataset["charges"])
plt.title("Distribution of costs")
plt.show()
No description has been provided for this image
In [51]:
plt.figure(figsize=(15,8))
sns.swarmplot(x = 'charges', y = 'Gender', data = dataset)
plt.title('Swarm Plot of Charges by Gender')
plt.show()
No description has been provided for this image
In [52]:
plt.figure(figsize=(15,5))
sns.violinplot(x = "City tier", y = "charges", data = dataset)
plt.title('Violin Plot of Charges by City Tier')
plt.show()
No description has been provided for this image

Create a radar chart to visualize the median hospitalization cost for each hospital tier.¶

In [54]:
import plotly.express as ex
median = dataset.groupby('Hospital tier')[['charges']].median().reset_index()
In [55]:
fig = ex.line_polar(median,r='charges',theta='Hospital tier',line_close=True,title='Median Hospitalization Cost by Hospital Tier')
fig.show()

Create a frequency table and a stacked bar chart to visualize the distribution of people across different city tiers and hospital tiers.¶

In [57]:
Freq_table = pd.crosstab(dataset['City tier'],dataset['Hospital tier'])
Freq_table
Out[57]:
Hospital tier 1 2 3
City tier
1 64 317 160
2 89 365 157
3 91 348 164
In [58]:
Freq_table.plot(kind='bar',stacked=True)
plt.xlabel('City tiers')
plt.ylabel('Hospital tiers')
plt.title('Patient Distribution by City and Hospital Tier')
for idx, tier in enumerate(Freq_table.index):
    bottom = np.zeros(len(Freq_table.columns))
    for col in Freq_table.columns:
        count = Freq_table.loc[tier, col]
        plt.text(idx, bottom[idx] + count / 2, str(count), ha='center', va='center', color='white', fontsize=10)
        bottom[idx] += count
plt.show()
No description has been provided for this image

Test the following null hypotheses:¶

A. The average hospitalization costs for the three types of hospitals are not significantly different¶

In [60]:
import scipy.stats as stats

print('Null Hypothesis : Average hospitalization costs for the three types of hospitals are not significantly different.')
print()

# ANOVA
f_val, p_val = stats.f_oneway(
    dataset[dataset['Hospital tier'] == 1]['charges'],
    dataset[dataset['Hospital tier'] == 2]['charges'],
    dataset[dataset['Hospital tier'] == 3]['charges']
)

print('P-value:', p_val)
print()

if p_val < 0.05:
    print("Reject null hypothesis")
else:
    print("Accept null hypothesis")
Null Hypothesis : Average hospitalization costs for the three types of hospitals are not significantly different.

P-value: 5.599708129057081e-137

Reject null hypothesis

B. The average hospitalization costs for the three types of cities are not significantly different¶

In [62]:
print('Null Hypothesis : The average hospitalization costs for the three types of cities are not significantly different.')
print()

# ANOVA
f_val, p_val = stats.f_oneway(
    dataset[dataset['City tier'] == 1]['charges'],
    dataset[dataset['City tier'] == 2]['charges'],
    dataset[dataset['City tier'] == 3]['charges']
)

print('P-value:', p_val)
print()

if p_val < 0.05:
    print("Reject null hypothesis")
else:
    print("Accept null hypothesis")
Null Hypothesis : The average hospitalization costs for the three types of cities are not significantly different.

P-value: 0.27860357802694374

Accept null hypothesis

C. The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers¶

In [64]:
print('Null Hypothesis : The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers.')
print()

smokers_charges = dataset[dataset['smoker'] == 1]['charges']
nonsmokers_charges = dataset[dataset['smoker'] == 0]['charges']

t_val, p_val = stats.ttest_ind(smokers_charges, nonsmokers_charges, equal_var=False)

print('P-value:', p_val)
print()

if p_val < 0.05:
    print("Reject null hypothesis")
else:
    print("Accept null hypothesis")
Null Hypothesis : The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers.

P-value: 4.3732993882662876e-229

Reject null hypothesis

D. Smoking and heart issues are independent¶

In [66]:
print('Null Hypothesis : Smoking and heart issues are independent.')
print()

contingency_table = pd.crosstab(dataset['smoker'], dataset['Heart Issues'])

chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print('Chi-square Statistic:', chi2)
print()

print('P-value:', p)
print()


if p < 0.05:
    print("Reject null hypothesis")
else:
    print("Accept null hypothesis")
Null Hypothesis : Smoking and heart issues are independent.

Chi-square Statistic: 0.012577038637804683

P-value: 0.9107065371179246

Accept null hypothesis

Machine Learning Part¶

1. Identify and visualize highly correlated predictors using a heatmap.¶

In [69]:
dataset = dataset.loc[:, ~dataset.columns.isin(['Customer ID', 'name'])]
In [122]:
dataset
Out[122]:
year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker group_state age Gender
0 1992 7 9 0 563.84 2 3 2 17.58 4.51 0 0 0 1 0 3 31 0
1 1992 11 30 0 570.62 2 1 2 17.60 4.39 0 0 0 1 0 3 31 0
2 1993 6 30 0 600.00 2 1 2 16.47 6.35 0 0 1 1 0 3 30 1
3 1992 9 13 0 604.54 3 3 2 17.70 6.28 0 0 0 1 0 3 31 0
4 1998 7 27 0 637.26 3 3 2 22.34 5.57 0 0 0 1 0 3 25 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2328 1994 10 27 1 51194.56 1 3 0 36.40 6.07 0 0 0 0 1 1 29 0
2329 1962 8 4 0 52590.83 1 3 0 32.80 6.59 0 0 0 0 1 1 61 0
2330 1989 6 19 0 55135.40 1 2 1 35.53 5.45 0 0 0 0 1 2 34 1
2333 1977 6 8 0 62592.87 2 3 2 30.36 5.77 0 0 0 0 1 3 46 0
2334 1968 10 12 0 63770.43 1 3 2 47.41 7.47 0 0 0 0 1 3 55 1

1755 rows × 18 columns

In [154]:
plt.figure(figsize=(15, 16))
sns.heatmap(dataset.corr(), square=True, annot=True,linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show() #Here, we'll have values  b/w -1 to 1
No description has been provided for this image

2. Develop a regression model Linear or Ridge. Evaluate the model with k-fold cross validation.¶

Also, ensure that you apply all the following suggestions:

• Implement the stratified 5-fold cross validation technique for both model building and validation

• Utilize effective standardization techniques and hyperparameter tuning

• Incorporate sklearn-pipelines to streamline the workflow

• Apply appropriate regularization techniques to address the bias-variance trade-off

• Create five folds in the data, and introduce a variable to identify the folds

• Develop Gradient Boost model and determine the variable importance scores, and identify the redundant variables

In [158]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
In [160]:
X = dataset.drop(columns=['charges']) #bcz we have to predict charges
y = dataset['charges']
In [164]:
y
Out[164]:
0         563.84
1         570.62
2         600.00
3         604.54
4         637.26
          ...   
2328    51194.56
2329    52590.83
2330    55135.40
2333    62592.87
2334    63770.43
Name: charges, Length: 1755, dtype: float64
In [168]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
In [170]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
In [174]:
linear_pipeline = Pipeline([('scaler',StandardScaler()),('linear_regression',LinearRegression())])
In [176]:
linear_pipeline
Out[176]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_regression', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_regression', LinearRegression())])
StandardScaler()
LinearRegression()
In [178]:
ridge_pipeline = Pipeline([('scaler',StandardScaler()),('ridge_regression',Ridge())])
In [180]:
ridge_pipeline
Out[180]:
Pipeline(steps=[('scaler', StandardScaler()), ('ridge_regression', Ridge())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('ridge_regression', Ridge())])
StandardScaler()
Ridge()
In [212]:
ridge_params = {'ridge_regression__alpha':[0.1,1.0,10.0,20.0,100.0]}
In [214]:
ridge_grid_search = GridSearchCV(ridge_pipeline,ridge_params,cv=kf)
In [220]:
linear_scores = []
ridge_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    linear_pipeline.fit(X_train, y_train)
    y_pred_linear = linear_pipeline.predict(X_test)
    linear_scores.append(mean_squared_error(y_test, y_pred_linear))
    
    ridge_grid_search.fit(X_train, y_train)
    y_pred_ridge = ridge_grid_search.predict(X_test)
    ridge_scores.append(mean_squared_error(y_test, y_pred_ridge))

# Calculate and print the average RMSE
print(f'Linear Regression RMSE: {np.mean(linear_scores)**0.5:.4f}')
print(f'Ridge Regression RMSE: {np.mean(ridge_scores)**0.5:.4f}')
print(f'Best Ridge Alpha: {ridge_grid_search.best_params_["ridge_regression__alpha"]}')
Linear Regression RMSE: 4458.6490
Ridge Regression RMSE: 4458.5754
Best Ridge Alpha: 1.0
In [298]:
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
In [300]:
gbr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('GBR', GradientBoostingRegressor(random_state=42))
])
In [302]:
param_distributions = {
    'GBR__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'GBR__learning_rate': uniform(0.01, 0.2),
    'GBR__n_estimators': [100, 200],
    'GBR__max_depth': [3, 4, 5],
    'GBR__criterion': ['friedman_mse', 'squared_error']
}
In [304]:
gbr_random_search = RandomizedSearchCV(
    gbr_pipeline, 
    param_distributions, 
    cv=kf, 
    n_jobs=-1,
    scoring='neg_mean_squared_error', 
    n_iter=10, 
    random_state=42
)
In [306]:
gbr_scores = []


for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
 
    gbr_random_search.fit(X_train, y_train)
    y_pred_gbr = gbr_random_search.predict(X_test)
    gbr_scores.append(mean_squared_error(y_test, y_pred_gbr))


print(f'Gradient Boosting Regressor RMSE: {np.mean(gbr_scores)**0.5:.4f}')
print(f'Best GBR Parameters: {gbr_random_search.best_params_}')
Gradient Boosting Regressor RMSE: 3541.0587
Best GBR Parameters: {'GBR__criterion': 'squared_error', 'GBR__learning_rate': 0.04636499344142013, 'GBR__loss': 'squared_error', 'GBR__max_depth': 3, 'GBR__n_estimators': 200}

3. Case scenario:¶

Estimate the cost of hospitalization for Christopher, Ms. Jayna (Date of birth 12/28/1988; height 170 cm; and weight 85 kgs). She lives with her partner and two children in a tier-1 city, and her state’s State ID is R1011. She was found to be nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be estimated using tier-1 hospitals.

In [313]:
from datetime import datetime

dob = datetime.strptime('12/28/1988', '%m/%d/%Y')
today = datetime.today()
age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
In [319]:
df = pd.DataFrame({
    'year': [1988], 
    'month': [12], 
    'date': [28],  
    'city_tier': [1],  
    'children': [2],  
    'HbA1c': [5.8],  
    'smoker': [1],  
    'Heart Issues': [0],  
    'Any Transplants': [0],  
    'NumberOfMajorSurgeries': [0],  
    'Cancer history': [1], 
    'Hospital tier': [1],  
    'BMI': [85 / (1.70 ** 2)],  
    'age': [age],  
    'Gender': [0],  
    'state_group': [1],  
    'State ID_R1011': [1]  
})
In [321]:
df
Out[321]:
year month date city_tier children HbA1c smoker Heart Issues Any Transplants NumberOfMajorSurgeries Cancer history Hospital tier BMI age Gender state_group State ID_R1011
0 1988 12 28 1 2 5.8 1 0 0 0 1 1 29.411765 35 0 1 1
In [327]:
# training data has specific columns and some columns might be missing in df, we need to handle that.
for col in X.columns:
    if col not in df.columns:
        df[col] = 0  

# I Reorder columns to match the training data
df = df[X.columns]
In [331]:
# Ridge regression prediction
ridge_pred = ridge_grid_search.predict(df)[0]

# Gradient Boosting regression prediction
gbr_pred = gbr_random_search.predict(df)[0]

print(f'Predicted Hospitalization Cost using Ridge Regression: {ridge_pred:.2f} /-')
print(f'Predicted Hospitalization Cost using Gradient Boosting: {gbr_pred:.2f} /-')
Predicted Hospitalization Cost using Ridge Regression: 31814.93 /-
Predicted Hospitalization Cost using Gradient Boosting: 24107.71 /-
In [ ]: